1 Learning Objectives - Explore

Exploratory Data Analysis (cont’d): - Pairs plot to show correlation between variables and avoid multicollinearity (see 8.2 Many predictors in a model) Logistic Regression seen as an evolution of techniques - Linear Model to show simplest multivariate regression, but predictions can be outside the binary values. - Generalized Linear Model uses a logit transformation to constrain the outputs to being within two values. - Generalized Additive Model allows for “wiggle” in predictor terms. - Maxent (Maximum Entropy) is a presence-only modeling technique that allows for a more complex set of shapes between predictor and response.

2 Load packages and environmental data layers

librarian::shelf(
  DT, dplyr, dismo, GGally, here, readr, tidyr)
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = F)

dir_data    <- here("data")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 1552

2.1 Displate environmental data in a table

datatable(pts_env, rownames = F)

2.2 Use GGally to look at pair plots and examine correlations between environmental variables

GGally::ggpairs(
  select(pts_env, -ID),
  aes(color = factor(present)))

# Logistic Regression ## Setup Data - drop rows of data with any NA values (later, we’ll learn how to impute values) - remove terms we don’t want to model

d <- pts_env %>% 
  select(-ID) %>%  # remove terms we don't want to model
  tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 1548

2.3 Linear Model

  • dependent variable Y: presence/absence, aka 1/0
  • independent variables X: everything else in the dataframe, aka the environmental layers
mod <- lm(present ~ ., data = d)
summary(mod)
## 
## Call:
## lm(formula = present ~ ., data = d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.06039 -0.30499  0.06589  0.28673  1.08016 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -5.668e-01  9.043e-01  -0.627  0.53093    
## WC_alt                   -5.757e-05  1.022e-04  -0.563  0.57327    
## WC_bio1                   1.650e-01  1.918e-02   8.603  < 2e-16 ***
## WC_bio4                  -2.082e-03  2.131e-03  -0.977  0.32876    
## WC_bio12                 -1.889e-03  2.311e-04  -8.175 6.11e-16 ***
## ER_climaticMoistureIndex  1.622e+00  2.271e-01   7.140 1.43e-12 ***
## ER_tri                    4.292e-03  1.405e-03   3.055  0.00229 ** 
## ER_topoWet                9.895e-03  1.621e-02   0.610  0.54163    
## lon                       2.432e-02  2.519e-03   9.654  < 2e-16 ***
## lat                       8.610e-02  1.472e-02   5.850 6.00e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4042 on 1538 degrees of freedom
## Multiple R-squared:  0.3507, Adjusted R-squared:  0.3469 
## F-statistic: 92.31 on 9 and 1538 DF,  p-value: < 2.2e-16

Note: a linear model is ineffective because it predicts values outside the 0 - 1 range

y_predict <- predict(mod, pts_env, type="response")
y_true    <- pts_env$present

range(y_predict)
## [1] NA NA
range(y_true)
## [1] 0 1

3 Generalized Linear Model

To solve this problem, we will apply a Logit Transformation

# fit a generalized linear model with a binomial logit link function
mod <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mod)
## 
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"), 
##     data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4482  -0.7997  -0.1347   0.7655   2.5857  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -2.9805150  6.3935208  -0.466  0.64109    
## WC_alt                   -0.0007000  0.0006809  -1.028  0.30393    
## WC_bio1                   0.8844692  0.1320449   6.698 2.11e-11 ***
## WC_bio4                   0.0060152  0.0134718   0.446  0.65524    
## WC_bio12                 -0.0112959  0.0015053  -7.504 6.18e-14 ***
## ER_climaticMoistureIndex 10.3772734  1.5333428   6.768 1.31e-11 ***
## ER_tri                    0.0287820  0.0093305   3.085  0.00204 ** 
## ER_topoWet                0.0731181  0.1003126   0.729  0.46606    
## lon                       0.1188077  0.0160109   7.420 1.17e-13 ***
## lat                       0.3553762  0.1109841   3.202  0.00136 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2146.0  on 1547  degrees of freedom
## Residual deviance: 1512.3  on 1538  degrees of freedom
## AIC: 1532.3
## 
## Number of Fisher Scoring iterations: 5

Note: we are not within the 0 - 1 range we want to be in

y_predict <- predict(mod, d, type="response")
range(y_predict)
## [1] 0.006823366 0.950054989

3.1 Term Plots

termplot(mod, partial.resid = TRUE, se = TRUE, main = F)

4 Generalized Additive Model (GAM)

We can further improve the GLM by adding “wiggle” to the relationship between the predictor and response variables

librarian::shelf(mgcv)

# fit a generalized additive model with smooth predictors
mod <- mgcv::gam(
  formula = present ~ s(WC_alt) + 
                      s(WC_bio1) + 
                      s(WC_bio4) + 
                      s(WC_bio12) + 
                      s(ER_climaticMoistureIndex) + 
                      s(ER_tri) + 
                      s(ER_topoWet) + 
                      s(lon) + 
                      s(lat), 
  family = binomial, data = d)
summary(mod)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio4) + s(WC_bio12) + 
##     s(ER_climaticMoistureIndex) + s(ER_tri) + s(ER_topoWet) + 
##     s(lon) + s(lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   -74.78      36.89  -2.027   0.0427 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                               edf Ref.df Chi.sq  p-value    
## s(WC_alt)                   2.564  3.201  6.335 0.119962    
## s(WC_bio1)                  4.207  5.165 23.346 0.000436 ***
## s(WC_bio4)                  5.911  7.076 30.629 8.15e-05 ***
## s(WC_bio12)                 7.845  8.500 40.908  < 2e-16 ***
## s(ER_climaticMoistureIndex) 7.694  8.129 20.069 0.010477 *  
## s(ER_tri)                   8.455  8.711 30.665 0.000500 ***
## s(ER_topoWet)               7.710  8.091 15.253 0.075929 .  
## s(lon)                      8.877  8.982 73.399  < 2e-16 ***
## s(lat)                      5.175  6.313 15.520 0.015247 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.541   Deviance explained = 49.8%
## UBRE = -0.22746  Scale est. = 1         n = 1548

4.1 Term Plots

plot(mod, scale=0)

5 Maxent (Max Entropy)

This is the most commonly used species distribution model because it requires few input data points, all of which can be presence observation points, and is easy to use with a Java GUI.

# load extra packages
librarian::shelf(
  maptools, sf)

# show version of maxent
maxent()
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)

# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>% 
  sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class

# fit a maximum entropy model
mod <- maxent(env_stack, obs_sp)
## This is MaxEnt version 3.4.3
# plot variable contributions per predictor
plot(mod)

6 Term Plots

# plot term plots
response(mod)

## Predict

# predict
y_predict <- predict(env_stack, mod) #, ext=ext, progress='')

plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')